
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Show a progress bar for every dask computation triggered below.
ProgressBar().register()

# Worker-level table that the RAIS columns are merged onto further down;
# loaded eagerly with pandas.
rais_workers = pd.read_csv('data/rais_matched_firm_out.csv')

# Read these columns as strings instead of letting dask infer a dtype
# (presumably identifiers whose leading zeros must survive -- TODO confirm).
RAIS_DTYPES = {'radiccnpj': 'object', 'numectps': 'object'}

# Lazily load each yearly RAIS file and tag its rows with the year, which is
# later used as a merge key together with CPF.
rais_2014 = dd.read_csv('rais/rais_2014.csv',
                        dtype=RAIS_DTYPES).assign(year=2014)

rais_2019 = dd.read_csv('rais/rais_2019.csv',
                        dtype=RAIS_DTYPES).assign(year=2019)

# Stack both years into one lazy frame. DataFrame.append is deprecated and
# has been removed from current pandas/dask; dd.concat is the supported
# equivalent (row-wise, outer join on columns).
rais = dd.concat([rais_2014, rais_2019])

# Keep only the RAIS columns that rais_workers does not already carry, so the
# merge does not produce duplicated (_x/_y) columns. Columns whose name
# contains 'Unnamed' are dropped as well.
# Filtering rais.columns in order (instead of taking a set difference) keeps
# the resulting column order stable from run to run.
known_cols = set(rais_workers.columns)

cols = [c for c in rais.columns
        if c not in known_cols and 'Unnamed' not in c]

# The merge keys exist on both sides and were removed above; add them back.
cols.extend(['CPF', 'year'])

rais = rais[cols]

# Inner join (the merge default) of the pandas worker table with the lazy
# RAIS frame on worker id and year.
rais_workers = dd.merge(rais_workers, rais, on=['CPF', 'year'])

# Materialise the merged result as an in-memory pandas DataFrame.
rais_workers = rais_workers.compute()

# Drop rows whose 'tpvinculo' contains 'EST'.
# `~` is the boolean inversion operator; unary `-` only works on a clean bool
# Series. With na=False a missing 'tpvinculo' counts as "does not contain",
# so such rows are kept and the mask stays a plain boolean Series.
is_est = rais_workers['tpvinculo'].str.contains('EST', na=False)
rais_workers = rais_workers[~is_est]

# Keep only rows where 'empem3112' equals "Sim".
rais_workers = rais_workers[rais_workers['empem3112'] == "Sim"]

# One row per (CPF, year): sort by 'tempempr' descending so that
# drop_duplicates (keep='first' by default) retains the row with the largest
# 'tempempr' in each group.
rais_workers = (
    rais_workers
    .sort_values(by='tempempr', ascending=False)
    .drop_duplicates(subset=['CPF', 'year'])
)

# NOTE: the index is written as the first (unnamed) column, which pandas
# reads back as 'Unnamed: 0'. Kept as-is so the output file layout is
# unchanged for downstream consumers.
rais_workers.to_csv('data/rais_matched.csv')
